# Recency, Frequency and Monetary (RFM) analysis of an e-commerce dataset
# Read the raw transaction export into a data frame and preview its first rows.
df <- read.csv(file = "d:/Ecommerce Project 2.csv")
head(df, n = 6L)
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 29-Nov-16
## 2 536365 71053 WHITE METAL LANTERN 6 29-Nov-16
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 29-Nov-16
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 29-Nov-16
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 29-Nov-16
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 29-Nov-16
## UnitPrice CustomerID Country X
## 1 2.55 17850 United Kingdom NA
## 2 3.39 17850 United Kingdom NA
## 3 2.75 17850 United Kingdom NA
## 4 3.39 17850 United Kingdom NA
## 5 3.39 17850 United Kingdom NA
## 6 7.65 17850 United Kingdom NA
# Displaying the structure (the data types) of the e-commerce data frame
str(df)
## 'data.frame': 541909 obs. of 9 variables:
## $ InvoiceNo : chr "536365" "536365" "536365" "536365" ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : int 17850 17850 17850 17850 17850 17850 17850 17850 17850 13047 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ X : logi NA NA NA NA NA NA ...
# Need to extract the numbers from InvoiceNo and CustomerID.
# Eliminating characters from InvoiceNo
# Remove every upper-case letter from the invoice number text, then store the
# remainder as a numeric column.
inv <- as.numeric(gsub("[A-Z]*", "", df$InvoiceNo))
df$InvoiceNo <- inv
# Remove every non-alphanumeric character from the customer id, then store it
# as a numeric column.
cid <- as.numeric(gsub("[^[:alnum:]]", "", df$CustomerID))
df$CustomerID <- cid
#reviewing the structure of the data frame again
str(df)
## 'data.frame': 541909 obs. of 9 variables:
## $ InvoiceNo : num 536365 536365 536365 536365 536365 ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num 17850 17850 17850 17850 17850 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ X : logi NA NA NA NA NA NA ...
# Count the NA entries of every column, then list only the columns that
# actually contain missing values.
df_missing <- vapply(df, function(column) sum(is.na(column)), integer(1))
df_missing[df_missing > 0]
## CustomerID X
## 135080 541909
# dropping the column x as it has no values
df_missing
## InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice
## 0 0 0 0 0 0
## CustomerID Country X
## 135080 0 541909
# Drop the all-NA column "X" by name instead of by position (it used to be
# removed as "column 9"), so the step stays correct if the column order of the
# input file changes.
df <- df[, setdiff(names(df), "X"), drop = FALSE]
# Remove the rows that still contain a missing value; after dropping X the only
# column with NAs is CustomerID (see the counts printed above).
df <- na.omit(df)
# Re-count the NAs per column to confirm nothing is missing any more.
df_missing <- vapply(df, function(column) sum(is.na(column)), integer(1))
df_missing[df_missing > 0]
## named integer(0)
dim(df)
## [1] 406829 8
#creating a column total_price as quantity*unit_price
# Line total of every row: quantity multiplied by the unit price.
df$tot_price <- with(df, Quantity * UnitPrice)
str(df)
## 'data.frame': 406829 obs. of 9 variables:
## $ InvoiceNo : num 536365 536365 536365 536365 536365 ...
## $ StockCode : chr "85123A" "71053" "84406B" "84029G" ...
## $ Description: chr "WHITE HANGING HEART T-LIGHT HOLDER" "WHITE METAL LANTERN" "CREAM CUPID HEARTS COAT HANGER" "KNITTED UNION FLAG HOT WATER BOTTLE" ...
## $ Quantity : int 6 6 8 6 6 2 6 6 6 32 ...
## $ InvoiceDate: chr "29-Nov-16" "29-Nov-16" "29-Nov-16" "29-Nov-16" ...
## $ UnitPrice : num 2.55 3.39 2.75 3.39 3.39 7.65 4.25 1.85 1.85 1.69 ...
## $ CustomerID : num 17850 17850 17850 17850 17850 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ tot_price : num 15.3 20.3 22 20.3 20.3 ...
## - attr(*, "na.action")= 'omit' Named int [1:135080] 623 1444 1445 1446 1447 1448 1449 1450 1451 1452 ...
## ..- attr(*, "names")= chr [1:135080] "623" "1444" "1445" "1446" ...
# View() opens a spreadsheet-style data viewer, which is only meaningful when a
# person is driving the session; skip it when the script runs non-interactively.
if (interactive()) {
  View(df)
}
library(plyr)
## Warning: package 'plyr' was built under R version 4.0.5
# Tabulate the number of rows per country (plyr::count returns the columns
# `x` and `freq`), sort ascending by frequency and show the rarest countries.
df_country <- plyr::count(df$Country)
df_country <- df_country[order(df_country$freq), ]
head(df_country)
## x freq
## 29 Saudi Arabia 10
## 3 Bahrain 17
## 9 Czech Republic 30
## 5 Brazil 32
## 22 Lithuania 35
## 21 Lebanon 45
#converting country to factor variables
df$Country=as.factor(df$Country)
# Creating the columns for the recency, frequency and monetary analysis
# Grouping the data by customer id: how much each customer has spent
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Monetary value: total of the line totals (tot_price) for each customer.
df_price <- dplyr::summarise(dplyr::group_by(df, CustomerID), "tot" = sum(tot_price))
head(df_price)
## # A tibble: 6 x 2
## CustomerID tot
## <dbl> <dbl>
## 1 12346 0
## 2 12347 4310
## 3 12348 1797.
## 4 12349 1758.
## 5 12350 334.
## 6 12352 1545.
nrow(df_price)
## [1] 4372
#grouping the data by customer id ,based on how many times they have transacted
library(dplyr)
# df_inv_cid=df %>%
# group_by(CustomerID)%>%
# count(InvoiceNo)
# Frequency: number of data rows recorded for each customer.
# NOTE(review): n() counts rows (invoice line items), not distinct InvoiceNo
# values -- confirm this is the intended definition of "times transacted".
df_txn <- dplyr::summarise(dplyr::group_by(df, CustomerID), count = dplyr::n())
head(df_txn)
## # A tibble: 6 x 2
## CustomerID count
## <dbl> <int>
## 1 12346 2
## 2 12347 182
## 3 12348 31
## 4 12349 73
## 5 12350 17
## 6 12352 95
nrow(df_txn)
## [1] 4372
#separating date into day month and year
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
# Split the "dd-Mon-yy" invoice date text on "-" into separate Day, Month and
# Year columns (the original InvoiceDate column is replaced by the three parts).
df2 <- df %>%
  separate(col = "InvoiceDate", into = c("Day", "Month", "Year"), sep = "-")
head(df2)
## InvoiceNo StockCode Description Quantity Day Month
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 29 Nov
## 2 536365 71053 WHITE METAL LANTERN 6 29 Nov
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 29 Nov
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 29 Nov
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 29 Nov
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 29 Nov
## Year UnitPrice CustomerID Country tot_price
## 1 16 2.55 17850 United Kingdom 15.30
## 2 16 3.39 17850 United Kingdom 20.34
## 3 16 2.75 17850 United Kingdom 22.00
## 4 16 3.39 17850 United Kingdom 20.34
## 5 16 3.39 17850 United Kingdom 20.34
## 6 16 7.65 17850 United Kingdom 15.30
unique(df2$Month)
## [1] "Nov" "Dec" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct"
#the total transactions by month
# Total revenue per month (Month holds the three-letter month text).
# No install.packages() call here: dplyr is already attached at this point, so
# installing it from inside the script only triggers a "package is in use"
# warning; packages should be installed once, outside the analysis.
library(dplyr)
df2 %>%
  group_by(Month) %>%
  summarise(sum(tot_price))
## # A tibble: 12 x 2
## Month `sum(tot_price)`
## <chr> <dbl>
## 1 Apr 409698.
## 2 Aug 643654.
## 3 Dec 717385.
## 4 Feb 434829.
## 5 Jan 517833.
## 6 Jul 602283.
## 7 Jun 576932.
## 8 Mar 562237.
## 9 May 684053.
## 10 Nov 1245328.
## 11 Oct 1029836.
## 12 Sep 875996.
# Total revenue per (two-digit) year.
summarise(group_by(df2, Year), sum(tot_price))
## # A tibble: 2 x 2
## Year `sum(tot_price)`
## <chr> <dbl>
## 1 16 554604.
## 2 17 7745462.
#calculating recency of each customer
head(df)
## InvoiceNo StockCode Description Quantity InvoiceDate
## 1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 29-Nov-16
## 2 536365 71053 WHITE METAL LANTERN 6 29-Nov-16
## 3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 29-Nov-16
## 4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 29-Nov-16
## 5 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 29-Nov-16
## 6 536365 22752 SET 7 BABUSHKA NESTING BOXES 2 29-Nov-16
## UnitPrice CustomerID Country tot_price
## 1 2.55 17850 United Kingdom 15.30
## 2 3.39 17850 United Kingdom 20.34
## 3 2.75 17850 United Kingdom 22.00
## 4 3.39 17850 United Kingdom 20.34
## 5 3.39 17850 United Kingdom 20.34
## 6 7.65 17850 United Kingdom 15.30
# Parse the "dd-Mon-yy" text into Date values, then show the latest invoice
# date in the data (used below as the reference point for recency).
df[["InvoiceDate"]] <- as.Date(df[["InvoiceDate"]], format = "%d-%b-%y")
max(df[["InvoiceDate"]])
## [1] "2017-12-07"
nrow(df)
## [1] 406829
library(dplyr)
# Most recent invoice date of every customer; the summary column is left
# unnamed, so it is called `max(InvoiceDate)`.
df_date <- summarise(group_by(df, CustomerID), max(InvoiceDate))
nrow(df_date)
## [1] 4372
# Recency: days between the latest invoice date in the whole data set and the
# customer's own latest invoice date.
df_date$diff_in_days <- difftime(
  max(df$InvoiceDate),
  df_date[["max(InvoiceDate)"]],
  units = "days"
)
df_date
## # A tibble: 4,372 x 3
## CustomerID `max(InvoiceDate)` diff_in_days
## <dbl> <date> <drtn>
## 1 12346 2017-01-16 325 days
## 2 12347 2017-12-05 2 days
## 3 12348 2017-09-23 75 days
## 4 12349 2017-11-19 18 days
## 5 12350 2017-01-31 310 days
## 6 12352 2017-11-01 36 days
## 7 12353 2017-05-17 204 days
## 8 12354 2017-04-19 232 days
## 9 12355 2017-05-07 214 days
## 10 12356 2017-11-15 22 days
## # ... with 4,362 more rows
#combining recency, frequency and monetory parameters in a single data frame
# Assemble the RFM table with one row per customer; every column is stored as
# a plain double (recency in days, frequency as a row count).
# NOTE(review): the three source tables are combined by position, which
# presumably relies on all of them being grouped by CustomerID in the same order.
rfm_matrix <- data.frame(
  "custid" = df_price$CustomerID,
  "recency" = as.numeric(df_date$diff_in_days),
  "frequency" = as.numeric(df_txn$count),
  "monetory" = df_price$tot
)
nrow(rfm_matrix)
## [1] 4372
head(rfm_matrix)
## custid recency frequency monetory
## 1 12346 325 2 0.00
## 2 12347 2 182 4310.00
## 3 12348 75 31 1797.24
## 4 12349 18 73 1757.55
## 5 12350 310 17 334.40
## 6 12352 36 95 1545.41
#install.packages("OneR")
library("OneR")
## Warning: package 'OneR' was built under R version 4.0.5
head(rfm_matrix$monetory)
## [1] 0.00 4310.00 1797.24 1757.55 334.40 1545.41
min(rfm_matrix$monetory)
## [1] -4287.63
max(rfm_matrix$monetory)
## [1] 279489
#creating bins for each variables recency, frequency and monetory
#creating bins for monetory
summary(rfm_matrix$monetory)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4287.6 293.4 648.1 1898.5 1611.7 279489.0
# Monetary score: "1" (lowest spend) .. "5" (highest spend).
# The cut points are the three quartiles and the mean of `monetory`, computed
# from the data instead of being typed in from the rounded summary() printout.
# They are sorted explicitly because the mean lies above the 3rd quartile for
# this variable, so writing them in summary() order gives an unsorted vector.
b <- c(
  -Inf,
  sort(c(
    unname(quantile(rfm_matrix$monetory, probs = c(0.25, 0.5, 0.75))),
    mean(rfm_matrix$monetory)
  )),
  Inf
)
names <- c("1", "2", "3", "4", "5")
rfm_matrix <- rfm_matrix %>%
  mutate(mon_bins = cut(monetory, breaks = b, labels = names))
head(rfm_matrix)
## custid recency frequency monetory mon_bins
## 1 12346 325 2 0.00 1
## 2 12347 2 182 4310.00 5
## 3 12348 75 31 1797.24 4
## 4 12349 18 73 1757.55 4
## 5 12350 310 17 334.40 2
## 6 12352 36 95 1545.41 3
#creating bins for recency
summary(rfm_matrix$recency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 16.00 50.00 91.58 143.00 373.00
# Recency score: the labels run "5" .. "1", so the smallest number of days
# since the last purchase gets label "5" and the largest gets label "1".
# The cut points are the three quartiles and the mean of `recency`, computed
# from the data instead of being typed in from the rounded summary() printout.
b <- c(
  -Inf,
  sort(c(
    unname(quantile(rfm_matrix$recency, probs = c(0.25, 0.5, 0.75))),
    mean(rfm_matrix$recency)
  )),
  Inf
)
names <- c("5", "4", "3", "2", "1")
rfm_matrix <- rfm_matrix %>%
  mutate(rec_bins = cut(recency, breaks = b, labels = names))
head(rfm_matrix)
## custid recency frequency monetory mon_bins rec_bins
## 1 12346 325 2 0.00 1 1
## 2 12347 2 182 4310.00 5 5
## 3 12348 75 31 1797.24 4 3
## 4 12349 18 73 1757.55 4 4
## 5 12350 310 17 334.40 2 1
## 6 12352 36 95 1545.41 3 4
#creating bins for frequency
summary(rfm_matrix$frequency)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 17.00 42.00 93.05 102.00 7983.00
# Frequency score: "1" (fewest rows) .. "5" (most rows).
# The cut points are the three quartiles and the mean of `frequency`, computed
# from the data; the hand-typed vector carried 93.5 where the summary() mean
# is 93.05. Sorting keeps the breaks increasing wherever the mean falls.
b <- c(
  -Inf,
  sort(c(
    unname(quantile(rfm_matrix$frequency, probs = c(0.25, 0.5, 0.75))),
    mean(rfm_matrix$frequency)
  )),
  Inf
)
names <- c("1", "2", "3", "4", "5")
rfm_matrix <- rfm_matrix %>%
  mutate(freq_bins = cut(frequency, breaks = b, labels = names))
head(rfm_matrix)
## custid recency frequency monetory mon_bins rec_bins freq_bins
## 1 12346 325 2 0.00 1 1 1
## 2 12347 2 182 4310.00 5 5 5
## 3 12348 75 31 1797.24 4 3 2
## 4 12349 18 73 1757.55 4 4 3
## 5 12350 310 17 334.40 2 1 1
## 6 12352 36 95 1545.41 3 4 4
# Keep the customer id and the three score columns, in the order
# monetary, frequency, recency.
rfm_bin <- rfm_matrix[, c("custid", "mon_bins", "freq_bins", "rec_bins")]
head(rfm_bin)
## custid mon_bins freq_bins rec_bins
## 1 12346 1 1 1
## 2 12347 5 5 5
## 3 12348 4 2 3
## 4 12349 4 3 4
## 5 12350 2 1 1
## 6 12352 3 4 4
str(rfm_bin)
## 'data.frame': 4372 obs. of 4 variables:
## $ custid : num 12346 12347 12348 12349 12350 ...
## $ mon_bins : Factor w/ 5 levels "1","2","3","4",..: 1 5 4 4 2 3 1 3 2 5 ...
## $ freq_bins: Factor w/ 5 levels "1","2","3","4",..: 1 5 2 3 1 4 1 3 1 3 ...
## $ rec_bins : Factor w/ 5 levels "5","4","3","2",..: 5 1 3 2 5 2 5 5 5 2 ...
# Convert the score factors to the numbers written in their labels.
# as.numeric() applied directly to a factor returns the internal level codes,
# not the labels. rec_bins has its levels in the order "5","4","3","2","1", so
# the codes are the labels reversed (label "5" -> 1, label "1" -> 5); going
# through as.character() keeps the intended score for all three columns.
# The `##` output recorded further down was produced with the level-code
# conversion and therefore shows rec_bins inverted.
rfm_bin$mon_bins <- as.numeric(as.character(rfm_bin$mon_bins))
rfm_bin$rec_bins <- as.numeric(as.character(rfm_bin$rec_bins))
rfm_bin$freq_bins <- as.numeric(as.character(rfm_bin$freq_bins))
head(rfm_bin)
## custid mon_bins freq_bins rec_bins
## 1 12346 1 1 5
## 2 12347 5 5 1
## 3 12348 4 2 3
## 4 12349 4 3 2
## 5 12350 2 1 5
## 6 12352 3 4 2
head(rfm_bin)
## custid mon_bins freq_bins rec_bins
## 1 12346 1 1 5
## 2 12347 5 5 1
## 3 12348 4 2 3
## 4 12349 4 3 2
## 5 12350 2 1 5
## 6 12352 3 4 2
str(rfm_bin)
## 'data.frame': 4372 obs. of 4 variables:
## $ custid : num 12346 12347 12348 12349 12350 ...
## $ mon_bins : num 1 5 4 4 2 3 1 3 2 5 ...
## $ freq_bins: num 1 5 2 3 1 4 1 3 1 3 ...
## $ rec_bins : num 5 1 3 2 5 2 5 5 5 2 ...
head(rfm_bin)
## custid mon_bins freq_bins rec_bins
## 1 12346 1 1 5
## 2 12347 5 5 1
## 3 12348 4 2 3
## 4 12349 4 3 2
## 5 12350 2 1 5
## 6 12352 3 4 2
str(rfm_bin)
## 'data.frame': 4372 obs. of 4 variables:
## $ custid : num 12346 12347 12348 12349 12350 ...
## $ mon_bins : num 1 5 4 4 2 3 1 3 2 5 ...
## $ freq_bins: num 1 5 2 3 1 4 1 3 1 3 ...
## $ rec_bins : num 5 1 3 2 5 2 5 5 5 2 ...
# Overall score: plain average of the recency, frequency and monetary scores.
rfm_bin$tot_score <- with(rfm_bin, (rec_bins + freq_bins + mon_bins) / 3)
nrow(rfm_bin)
## [1] 4372
nrow(df)
## [1] 406829
#clustering based on RFM analysis
#Creating an elbow plot
n=1:15
# Total within-cluster sum of squares of a k-means fit, for the elbow plot.
#   k      : number of clusters to fit.
#   data   : numeric table to cluster; defaults to the three score columns of
#            rfm_bin (evaluated when the function is called).
#   nstart : number of random starts passed to kmeans(); the best of them is
#            kept. A single start often stops in a poor local optimum, which
#            makes the elbow curve jump up and down between neighbouring k.
# Returns the tot.withinss value of the fitted model.
wss <- function(k,
                data = rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")],
                nstart = 25) {
  kmod <- kmeans(data, centers = k, nstart = nstart)
  kmod$tot.withinss
}
library(tidyverse)
wss_values=map_dbl(n,wss)
wss_values
## [1] 28750.382 13685.063 11932.020 7141.264 6547.659 5536.572 4900.850
## [8] 4491.803 4358.161 3472.163 3376.190 3342.792 2739.297 3221.899
## [15] 2949.219
# Elbow plot: total within-cluster sum of squares against the number of clusters.
plot(
  x = n, y = wss_values,
  type = "b", pch = 19, frame = FALSE, col = "red",
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
#making clusters by k=3
# k-means with 3 clusters on the three score columns.
# The seed is fixed (as the k = 6 fit below does) and several random starts are
# used, so the clustering is reproducible and less dependent on one random
# initialisation.
set.seed(100)
k_mod3 <- kmeans(
  rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")],
  centers = 3, nstart = 25
)
# Attach the cluster label of every customer to its scores, as a factor.
df_mod3 <- cbind(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], "cluster" = k_mod3$cluster)
df_mod3$cluster <- as.factor(df_mod3$cluster)
head(df_mod3)
## rec_bins freq_bins mon_bins cluster
## 1 5 1 1 3
## 2 1 5 5 2
## 3 3 2 4 1
## 4 2 3 4 2
## 5 5 1 2 3
## 6 2 4 3 2
# Accuracy of the clustering with 3 clusters = 0.6963555
#making clusters by k=4
head(rfm_bin)
## custid mon_bins freq_bins rec_bins tot_score
## 1 12346 1 1 5 2.333333
## 2 12347 5 5 1 3.666667
## 3 12348 4 2 3 3.000000
## 4 12349 4 3 2 3.000000
## 5 12350 2 1 5 2.666667
## 6 12352 3 4 2 3.000000
# k-means with 4 clusters on the three score columns.
# The seed is fixed (as the k = 6 fit below does) and several random starts are
# used, so the clustering is reproducible and less dependent on one random
# initialisation.
set.seed(100)
k_mod4 <- kmeans(
  rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")],
  centers = 4, nstart = 25
)
# Attach the cluster label of every customer to its scores, as a factor.
df_mod4 <- cbind(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], "cluster" = k_mod4$cluster)
df_mod4$cluster <- as.factor(df_mod4$cluster)
head(df_mod4, 20)
## rec_bins freq_bins mon_bins cluster
## 1 5 1 1 3
## 2 1 5 5 2
## 3 3 2 4 4
## 4 2 3 4 4
## 5 5 1 2 3
## 6 2 4 3 2
## 7 5 1 1 3
## 8 5 3 3 3
## 9 5 1 2 3
## 10 2 3 5 4
## 11 2 5 5 2
## 12 1 2 3 1
## 13 1 5 5 2
## 14 3 5 5 2
## 15 5 1 1 3
## 16 1 5 5 2
## 17 4 2 2 3
## 18 1 3 3 1
## 19 5 2 2 3
## 20 1 1 1 1
# Accuracy of the clustering with 4 clusters = 0.7284862
#making clusters by k=6
# Seeded k-means with 6 clusters on the three score columns.
set.seed(100)
k_mod6 <- kmeans(x = rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], centers = 6)
# Scores plus the cluster label of every customer, with the label as a factor.
df_mod6 <- cbind(
  rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")],
  "cluster" = k_mod6$cluster
)
df_mod6[["cluster"]] <- as.factor(df_mod6[["cluster"]])
head(df_mod6)
## rec_bins freq_bins mon_bins cluster
## 1 5 1 1 1
## 2 1 5 5 6
## 3 3 2 4 3
## 4 2 3 4 3
## 5 5 1 2 1
## 6 2 4 3 2
head(rfm_bin)
## custid mon_bins freq_bins rec_bins tot_score
## 1 12346 1 1 5 2.333333
## 2 12347 5 5 1 3.666667
## 3 12348 4 2 3 3.000000
## 4 12349 4 3 2 3.000000
## 5 12350 2 1 5 2.666667
## 6 12352 3 4 2 3.000000
# Accuracy of the clustering with 6 clusters = 0.8156666
#scaling
summary(df_mod6)
## rec_bins freq_bins mon_bins cluster
## Min. :1.00 Min. :1.00 Min. :1.000 1:1054
## 1st Qu.:1.00 1st Qu.:1.00 1st Qu.:1.750 2: 690
## Median :2.00 Median :2.00 Median :2.500 3: 212
## Mean :2.82 Mean :2.76 Mean :2.711 4: 574
## 3rd Qu.:4.00 3rd Qu.:4.00 3rd Qu.:3.250 5: 831
## Max. :5.00 Max. :5.00 Max. :5.000 6:1011
# Standardise the three score columns (scale() centres and scales by default).
df_mod6_scale <- scale(df_mod6[, c("rec_bins", "freq_bins", "mon_bins")])
summary(df_mod6_scale)
## rec_bins freq_bins mon_bins
## Min. :-1.1989 Min. :-1.1800 Min. :-1.1951
## 1st Qu.:-1.1989 1st Qu.:-1.1800 1st Qu.:-0.6712
## Median :-0.5400 Median :-0.5097 Median :-0.1472
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7778 3rd Qu.: 0.8310 3rd Qu.: 0.3768
## Max. : 1.4367 Max. : 1.5014 Max. : 1.5994
# k-means with 6 clusters on the standardised scores; seeded and run from
# several random starts so the result is reproducible.
set.seed(100)
kmod_61 <- kmeans(df_mod6_scale, centers = 6, nstart = 25)
# Accuracy of the clustering with 6 clusters after normalisation = 0.8232957
# In the resulting plot, observations are represented by points, using
# principal components if the number of variables is greater than 2.
# It is also possible to draw a concentration ellipse around each cluster.
#install.packages("ggpubr")
#install.packages("factoextra")
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.5
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Refit the 6-cluster k-means model with the score columns ordered monetary,
# recency, frequency, and rebuild the labelled table from it.
# The seed is set again so this refit is reproducible instead of replacing the
# seeded model with an arbitrary one. The PCA of the previous table that used
# to be computed here was dropped: its results were overwritten by the PCA
# that follows before anything read them.
set.seed(100)
k_mod6 <- kmeans(rfm_bin[, c("mon_bins", "rec_bins", "freq_bins")], centers = 6)
df_mod6 <- cbind(rfm_bin[, c("mon_bins", "rec_bins", "freq_bins")], "cluster" = k_mod6$cluster)
df_mod6$cluster <- as.factor(df_mod6$cluster)
head(df_mod6)
## mon_bins rec_bins freq_bins cluster
## 1 1 5 1 5
## 2 5 1 5 4
## 3 4 3 2 6
## 4 4 2 3 6
## 5 2 5 1 5
## 6 3 2 4 3
# PCA of the three score columns with unit-variance scaling (prcomp's argument
# is spelled `scale.`).
res.pca <- prcomp(df_mod6[, c("mon_bins", "rec_bins", "freq_bins")], scale. = TRUE)
# Coordinates of every customer on the principal components
ind.coord <- as.data.frame(get_pca_ind(res.pca)[["coord"]])
# Attach the k-means cluster label of every customer
ind.coord[["cluster"]] <- factor(df_mod6[["cluster"]])
# Eigenvalues and percentage of variance per component, rounded to one decimal
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue[["variance.percent"]]
head(eigenvalue)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.1 71.5 71.5
## Dim.2 0.6 20.5 92.0
## Dim.3 0.2 8.0 100.0
# Scatter plot of the customers on the first two principal components,
# coloured by cluster, with a convex hull per cluster and the cluster means
# drawn as larger points. The axis labels show the percentage of variance
# held in `variance.percent`.
# The plot is drawn once; the eigenvalue computation and the plot used to be
# repeated verbatim right after it.
ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2",
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  size = 1.5, legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)
library(ggplot2)
head(k_mod6)
## $cluster
## [1] 5 4 6 6 5 3 5 1 5 6 4 3 4 4 5 4 1 3 1 2 4 6 3 5 3 2 1 4 3 4 6 1 2 5 4 2 3
## [38] 3 3 4 4 6 1 5 5 2 1 4 6 4 4 1 3 3 1 4 4 1 2 3 1 4 1 3 1 3 4 4 2 4 4 4 3 6
## [75] 5 4 4 5 2 4 2 3 1 2 4 5 4 2 1 2 6 4 6 3 1 3 3 3 3 1 4 4 4 4 3 4 4 3 2 4 4
## [112] 6 4 3 5 4 2 2 1 3 2 2 6 4 4 5 2 5 5 1 3 5 1 2 2 1 5 3 4 4 2 4 1 2 4 4 3 2
## [149] 4 6 2 2 1 3 4 2 4 4 3 2 3 3 5 5 3 5 2 4 3 6 2 1 4 5 4 1 5 2 4 4 2 3 5 5 3
## [186] 3 4 2 1 2 1 4 4 1 2 2 2 2 1 1 1 1 4 2 4 4 3 6 1 2 2 2 5 3 3 4 3 2 4 1 4 3
## [223] 2 2 4 3 4 5 1 1 1 4 4 3 3 2 4 4 4 3 1 3 5 3 6 1 6 3 4 1 1 2 5 1 1 2 4 3 3
## [260] 2 2 4 6 5 5 2 4 1 4 5 2 6 3 4 3 3 4 4 4 4 4 5 4 3 5 3 1 3 2 3 4 3 2 2 6 4
## [297] 1 4 4 1 4 3 4 1 3 1 2 2 4 4 3 3 3 1 4 4 3 1 4 5 1 5 1 5 5 3 2 1 4 4 4 4 5
## [334] 6 4 1 1 2 4 6 3 6 1 1 4 1 1 3 3 3 4 3 4 3 3 2 2 2 1 5 1 1 3 2 5 6 1 2 1 1
## [371] 2 5 1 1 5 1 3 5 3 6 2 3 2 3 5 6 5 2 1 1 4 5 4 4 4 4 3 4 3 1 3 3 1 1 1 4 5
## [408] 4 1 2 5 3 1 4 1 5 2 1 5 5 6 4 3 2 5 2 3 2 3 3 5 3 5 2 2 5 4 5 3 6 2 4 6 6
## [445] 6 1 4 5 3 2 4 1 2 3 3 6 5 2 4 3 4 3 4 2 6 4 1 5 2 1 1 4 4 6 3 3 2 4 5 4 2
## [482] 4 4 2 3 5 3 4 5 1 5 1 2 1 2 3 2 1 6 2 2 2 3 2 3 1 1 4 5 5 4 1 2 5 5 4 4 4
## [519] 4 3 2 4 4 3 3 6 3 2 3 3 5 2 3 1 3 3 5 1 1 2 4 4 2 4 1 3 2 5 5 2 5 3 5 3 2
## [556] 2 4 5 5 1 4 4 2 5 4 6 2 5 4 4 2 2 4 6 2 4 4 5 5 4 2 3 5 3 5 3 4 4 4 1 3 3
## [593] 5 1 1 4 4 2 5 3 5 5 4 1 4 4 3 4 1 4 2 5 2 2 2 4 6 1 2 5 3 6 1 4 2 5 1 3 1
## [630] 3 4 2 2 3 3 4 3 2 4 2 4 4 5 3 5 2 2 1 2 5 4 4 4 3 3 2 4 4 1 3 4 3 5 5 5 3
## [667] 1 5 4 6 5 5 2 5 4 4 3 1 3 3 1 1 5 1 5 2 4 2 1 1 1 2 2 3 2 2 1 5 4 6 4 4 4
## [704] 4 5 3 2 5 2 2 3 2 1 3 2 5 4 1 2 1 5 5 6 2 5 1 1 5 1 6 5 3 4 3 2 3 3 6 4 2
## [741] 4 4 2 2 3 6 2 4 1 3 1 2 4 3 1 1 4 1 1 4 1 2 2 2 3 2 2 3 1 3 1 1 2 4 3 2 4
## [778] 2 1 5 1 2 4 3 1 3 4 5 4 1 1 3 2 5 4 1 3 3 1 1 2 2 2 4 3 5 1 3 2 4 2 5 4 4
## [815] 1 4 2 2 3 3 3 2 4 2 4 1 2 3 4 2 6 4 5 5 3 6 4 2 3 5 3 1 3 4 5 4 2 1 1 1 3
## [852] 5 4 5 2 5 1 3 2 4 5 3 1 5 1 6 2 1 3 3 1 2 2 2 4 1 4 5 3 4 5 5 2 3 3 4 3 2
## [889] 4 4 2 4 3 2 4 2 1 2 1 4 4 3 2 2 3 1 3 5 4 4 2 3 2 5 6 3 1 2 4 1 3 5 4 1 2
## [926] 3 4 6 2 5 1 5 2 4 2 6 3 2 4 2 4 2 4 5 3 2 2 2 3 4 3 4 1 3 5 5 3 5 4 6 6 6
## [963] 4 4 3 3 3 2 3 1 2 4 5 6 1 4 2 3 4 1 1 2 4 1 3 5 2 1 4 2 2 5 3 1 5 4 3 5 5
## [1000] 6 5 2 2 3 5 4 6 2 5 4 5 4 3 1 1 1 4 2 5 5 1 5 4 5 3 1 2 6 5 4 3 2 3 1 1 2
## [1037] 4 4 1 2 1 3 4 3 1 5 5 1 6 5 1 1 5 4 4 4 3 2 3 5 6 4 4 4 4 4 2 4 5 3 5 4 6
## [1074] 5 2 3 2 2 4 1 1 2 6 5 4 2 2 5 6 5 3 5 2 3 3 4 2 2 1 1 3 3 6 5 3 2 5 6 2 1
## [1111] 3 5 4 5 4 2 2 3 1 3 3 6 3 4 5 2 5 3 4 1 2 2 3 6 4 4 4 1 1 4 2 2 3 4 4 3 4
## [1148] 3 1 2 1 1 5 4 2 1 2 4 2 5 2 1 4 2 4 4 2 2 3 2 5 2 4 3 2 2 2 1 2 3 5 1 1 1
## [1185] 5 2 3 3 6 2 2 5 4 3 1 4 6 2 1 4 5 3 2 2 1 5 4 4 2 5 2 4 5 2 3 2 4 4 2 4 2
## [1222] 4 3 5 3 2 4 4 3 4 5 6 1 4 6 1 1 5 1 2 3 4 3 2 1 1 2 5 2 2 4 4 4 3 3 1 5 3
## [1259] 2 4 5 4 1 5 3 1 4 3 4 2 5 2 1 4 4 5 4 4 3 5 3 5 4 2 3 1 2 1 5 2 1 5 4 4 2
## [1296] 4 2 5 4 3 4 2 4 5 6 3 5 3 4 2 2 4 2 4 3 5 3 5 5 4 2 4 2 1 3 3 1 4 1 4 3 2
## [1333] 3 3 1 5 6 4 2 5 5 3 3 1 5 4 3 2 4 1 5 6 6 5 3 1 2 2 4 1 3 4 4 4 2 5 6 4 4
## [1370] 3 4 1 5 3 5 3 2 2 1 5 6 4 4 2 5 3 4 6 4 2 2 1 4 1 1 1 6 4 3 1 4 3 4 6 2 5
## [1407] 5 4 3 5 5 4 1 1 5 5 4 6 1 4 4 5 3 4 1 1 1 1 5 2 2 3 4 5 4 2 6 4 2 5 3 4 4
## [1444] 4 3 2 3 4 4 2 5 5 4 6 4 4 5 3 5 4 2 1 4 1 5 5 4 1 3 2 4 3 6 1 5 5 4 2 1 2
## [1481] 1 1 2 5 5 1 5 2 5 3 2 3 4 2 6 4 5 5 5 1 5 3 2 3 3 2 5 4 4 3 4 5 4 3 4 1 4
## [1518] 1 4 5 6 4 6 3 1 3 5 3 4 2 4 4 2 3 4 5 2 4 2 4 1 4 2 5 5 5 1 6 3 3 3 4 3 2
## [1555] 4 1 4 2 5 4 5 1 5 4 4 5 1 6 3 3 4 2 2 3 5 2 1 2 1 4 5 2 2 4 3 1 5 4 2 2 1
## [1592] 1 4 4 2 2 4 4 4 4 4 3 2 3 5 1 3 4 2 2 2 2 5 4 4 4 2 3 4 1 6 4 4 5 2 2 2 3
## [1629] 4 5 4 4 4 4 1 1 3 2 1 4 4 2 2 6 3 4 4 1 3 2 5 4 1 5 4 2 2 2 4 1 2 5 4 5 4
## [1666] 3 3 3 4 3 2 1 2 1 4 4 6 2 5 2 1 1 4 1 4 2 1 4 5 3 4 3 5 1 2 2 4 4 2 2 1 3
## [1703] 1 4 5 4 3 2 4 6 1 4 2 4 2 2 4 3 4 2 4 1 1 1 6 3 6 5 4 2 5 4 5 4 5 3 6 1 5
## [1740] 4 5 4 5 4 4 4 2 4 5 3 4 3 4 3 4 2 3 3 4 3 5 4 3 5 1 4 3 4 4 4 2 3 6 1 3 3
## [1777] 1 1 3 2 2 2 4 3 2 1 4 5 2 1 2 2 4 2 4 1 5 3 4 2 1 1 2 5 2 2 3 2 3 2 2 3 5
## [1814] 4 2 2 4 2 1 2 3 2 4 4 5 3 5 1 3 2 2 5 1 3 4 1 3 2 6 2 4 5 4 4 4 5 4 5 2 4
## [1851] 2 4 1 3 3 3 2 2 3 2 2 4 1 4 3 5 3 1 3 4 3 2 2 3 1 1 6 1 5 5 1 3 3 4 1 3 4
## [1888] 3 2 4 3 1 4 2 2 4 3 3 2 3 1 1 3 3 5 2 4 3 5 6 1 4 6 4 2 2 3 2 2 4 2 3 5 1
## [1925] 3 2 4 5 4 5 1 2 2 2 3 4 5 1 2 2 3 4 5 2 1 2 1 2 2 1 3 4 1 4 3 5 4 2 3 2 5
## [1962] 2 2 1 4 4 4 4 3 3 3 3 2 4 1 4 4 2 3 4 4 1 5 2 4 5 4 1 3 5 6 4 5 3 3 1 4 4
## [1999] 3 4 3 5 4 3 6 3 1 5 1 3 1 3 1 4 1 2 5 5 3 4 1 2 1 4 1 2 2 1 2 5 2 3 1 4 3
## [2036] 5 1 1 3 3 4 4 2 3 3 5 5 1 2 6 4 2 4 2 4 6 1 3 1 3 5 3 5 6 2 5 4 3 1 1 2 1
## [2073] 4 4 2 2 3 4 2 4 5 2 4 1 5 2 1 4 5 2 5 4 5 3 1 3 2 2 6 4 1 2 4 6 3 5 2 3 4
## [2110] 5 3 2 3 1 3 5 1 4 3 2 4 5 4 5 5 1 5 1 5 3 4 1 1 5 4 5 3 3 3 6 4 2 2 6 1 5
## [2147] 4 4 3 3 1 3 5 5 1 1 5 5 5 2 1 5 6 2 6 4 3 3 2 2 2 5 5 4 2 2 6 3 4 4 5 4 2
## [2184] 4 6 2 4 1 3 3 5 1 4 3 5 1 2 5 2 5 2 4 3 1 1 2 3 2 1 1 1 2 3 6 1 2 2 2 5 5
## [2221] 2 1 5 4 1 2 4 4 2 5 5 1 4 3 4 2 5 4 6 4 5 4 4 6 5 4 1 2 1 5 5 3 2 6 5 5 5
## [2258] 3 3 1 1 5 3 4 5 4 3 5 3 1 4 2 2 3 5 2 5 4 4 2 3 2 3 3 2 4 1 5 3 5 4 4 5 5
## [2295] 1 3 2 3 4 5 1 1 2 1 5 2 1 4 3 3 2 2 3 1 2 2 3 1 4 1 4 6 5 3 4 3 4 4 2 4 3
## [2332] 4 1 2 4 4 1 1 5 4 5 4 4 2 5 4 2 4 3 2 2 4 2 4 5 4 4 4 4 3 1 5 2 2 5 5 4 2
## [2369] 4 4 3 2 5 3 5 4 2 2 3 5 4 2 2 3 1 4 3 4 3 1 3 5 5 3 4 3 4 1 2 1 5 2 2 4 1
## [2406] 2 2 3 4 3 2 2 3 6 5 3 5 2 3 2 4 3 2 4 3 4 1 1 1 3 4 3 4 2 3 2 5 1 4 4 1 1
## [2443] 3 1 1 1 2 2 1 3 1 3 2 1 2 3 5 5 2 4 2 5 3 5 6 4 4 2 4 3 2 4 3 3 5 4 5 2 5
## [2480] 2 6 1 4 2 1 4 5 4 5 3 4 4 5 3 1 2 1 5 4 3 3 6 5 5 2 4 2 5 3 5 5 1 3 4 2 2
## [2517] 3 2 3 5 3 2 1 4 4 5 2 4 3 3 2 5 2 3 4 2 1 4 2 2 5 5 1 1 2 6 2 6 4 4 1 2 3
## [2554] 2 1 4 6 3 1 3 5 2 4 4 3 3 4 3 4 4 1 1 6 6 4 5 3 3 5 1 6 4 1 2 3 1 4 3 2 2
## [2591] 4 3 2 3 5 2 4 2 3 5 2 4 1 4 2 4 4 4 4 2 6 3 1 4 5 3 3 4 3 1 1 1 3 1 2 2 1
## [2628] 1 5 5 5 3 5 3 3 6 2 4 4 5 4 5 4 5 1 3 1 1 5 5 5 2 4 3 5 2 1 6 5 5 5 5 6 3
## [2665] 2 4 3 3 4 4 2 2 2 4 3 2 5 6 5 4 2 1 3 3 6 4 4 4 1 2 4 2 3 2 4 2 4 4 6 4 3
## [2702] 5 4 2 3 1 4 3 4 4 4 4 2 2 4 3 1 3 2 2 2 2 4 1 1 4 1 3 1 2 3 6 3 3 1 2 4 5
## [2739] 2 3 1 5 3 4 4 5 3 1 1 3 3 2 1 5 4 5 3 2 4 1 1 4 3 1 1 5 3 5 5 6 3 3 6 3 2
## [2776] 5 2 6 3 1 4 5 5 5 4 1 5 2 3 4 3 2 5 5 4 3 6 6 4 2 1 3 5 2 3 2 1 4 5 4 1 1
## [2813] 5 2 4 1 6 4 2 5 3 4 5 5 3 4 6 4 2 1 3 1 1 2 5 2 4 2 6 2 3 2 4 4 3 2 2 4 1
## [2850] 2 4 4 1 2 3 3 3 2 3 6 4 2 3 2 1 2 1 5 6 3 1 3 4 5 5 3 1 3 2 3 2 2 4 4 4 3
## [2887] 1 3 5 2 6 1 5 4 4 1 5 4 3 4 5 4 3 3 1 4 1 1 1 1 2 4 6 5 4 4 5 5 4 3 5 2 5
## [2924] 4 1 1 6 3 2 2 5 4 1 2 5 1 2 2 5 1 2 4 4 2 4 6 2 5 5 1 6 2 3 5 2 3 4 5 3 5
## [2961] 2 4 2 5 3 4 3 4 4 3 2 3 3 2 4 3 4 4 3 3 4 1 1 4 6 3 2 3 3 5 3 1 4 3 4 1 3
## [2998] 1 4 1 5 2 5 2 4 4 2 2 2 2 2 3 3 1 4 2 5 1 4 5 2 5 2 1 2 2 4 4 4 2 3 3 2 2
## [3035] 5 5 1 5 2 2 4 4 5 4 4 5 2 3 4 4 2 4 4 5 1 4 2 4 2 2 3 2 5 1 2 2 3 3 1 3 3
## [3072] 4 5 1 2 1 3 2 2 5 5 4 3 4 2 5 2 2 1 4 4 5 2 2 1 3 6 4 2 1 3 2 5 5 4 4 1 5
## [3109] 1 3 2 4 2 4 3 2 3 5 2 1 2 4 4 1 3 2 5 5 5 5 3 5 5 2 2 4 2 2 3 2 2 5 3 2 2
## [3146] 2 4 4 5 5 3 2 1 2 5 4 5 2 3 5 3 1 4 2 4 3 4 5 3 4 3 5 3 1 1 5 2 2 4 3 1 4
## [3183] 6 4 5 5 2 5 2 3 2 4 4 6 4 3 3 4 5 2 6 3 4 3 4 3 2 2 1 3 1 5 1 4 4 4 3 3 6
## [3220] 4 6 4 4 1 2 5 4 1 4 2 5 4 3 1 4 1 4 4 3 2 3 4 2 5 5 2 4 4 4 4 3 3 3 6 5 2
## [3257] 1 1 5 5 2 3 5 3 1 4 5 1 4 3 1 4 4 2 3 4 3 2 2 4 1 1 4 2 1 2 3 4 5 2 4 1 2
## [3294] 4 1 2 3 1 1 1 5 4 5 2 5 4 4 1 3 4 1 1 2 3 2 5 5 2 5 3 1 5 2 4 1 4 4 3 4 2
## [3331] 2 2 5 1 2 2 2 2 5 1 2 2 5 4 4 2 4 4 4 2 4 2 3 5 2 3 2 1 4 3 1 2 1 3 4 2 1
## [3368] 3 4 4 3 4 2 4 4 5 2 3 3 5 4 4 2 4 3 5 5 4 3 3 4 2 3 3 2 4 4 4 1 4 1 1 3 3
## [3405] 1 5 2 6 1 2 5 2 3 5 5 5 6 1 2 3 2 5 2 6 1 2 4 4 4 6 4 2 2 5 3 5 3 1 5 2 4
## [3442] 3 3 3 3 2 2 3 4 4 2 4 2 2 2 5 4 1 2 2 3 3 1 3 2 5 3 3 4 2 1 1 3 1 4 4 1 3
## [3479] 5 3 3 4 5 4 1 3 3 5 4 4 1 4 4 3 3 2 2 2 2 6 1 4 1 4 4 6 4 4 5 3 3 4 3 6 5
## [3516] 1 4 4 5 2 4 3 5 5 2 1 5 5 1 2 1 5 4 6 1 3 1 4 1 5 3 1 6 2 1 2 2 2 2 3 3 4
## [3553] 4 2 4 5 2 3 5 2 6 5 3 5 1 4 2 6 2 5 6 1 2 4 6 5 3 3 2 4 1 2 2 4 5 4 4 5 4
## [3590] 2 4 3 2 1 1 3 4 1 1 4 2 2 5 2 4 2 4 5 1 4 3 5 2 1 3 5 1 2 3 3 5 1 2 5 1 3
## [3627] 1 1 4 2 1 5 2 1 4 2 4 4 4 3 5 4 3 2 1 3 2 3 1 6 5 5 3 5 6 4 4 6 1 4 4 3 5
## [3664] 2 2 5 2 5 4 4 5 4 1 5 2 3 4 2 4 5 3 2 1 2 5 3 1 2 3 2 1 4 4 5 4 2 3 4 3 1
## [3701] 1 2 4 3 4 2 2 2 2 6 3 2 4 5 1 5 6 3 1 4 6 2 4 3 4 5 1 3 3 3 2 2 4 2 4 2 4
## [3738] 3 3 4 2 4 1 2 1 3 3 4 2 1 5 4 5 4 5 2 5 4 4 4 2 1 5 5 1 3 4 2 4 3 4 5 3 2
## [3775] 6 3 1 2 2 3 2 1 2 6 4 5 2 5 2 1 2 5 3 2 4 1 2 1 1 4 2 4 4 2 5 3 4 5 6 2 3
## [3812] 3 3 5 5 3 3 4 4 5 2 5 5 3 2 2 5 3 5 5 2 5 1 3 1 6 3 5 2 1 2 3 3 4 4 1 4 2
## [3849] 2 5 4 5 3 1 4 5 3 1 1 4 4 4 4 3 3 4 1 2 5 4 2 1 2 4 4 4 4 3 5 5 3 1 2 3 2
## [3886] 3 4 2 4 1 2 3 4 2 2 5 2 1 5 4 1 2 6 1 2 4 4 3 5 6 6 2 4 1 6 3 3 4 2 6 5 3
## [3923] 4 4 2 4 3 4 5 6 2 4 2 4 4 4 5 1 5 5 1 4 1 5 4 1 6 3 1 4 4 5 5 5 2 2 5 5 4
## [3960] 5 4 3 4 6 3 4 3 4 4 3 1 3 3 4 2 2 4 6 5 4 5 5 1 4 5 6 4 4 3 2 3 5 3 1 2 4
## [3997] 4 1 2 2 1 5 4 3 5 2 2 2 1 2 5 4 2 3 1 3 4 4 4 4 2 3 6 4 4 4 2 2 6 5 2 4 3
## [4034] 1 2 2 2 3 2 4 4 3 4 5 5 6 3 1 4 2 5 5 2 6 4 5 1 4 5 4 3 4 5 3 1 4 5 6 1 5
## [4071] 2 5 1 5 4 4 2 4 5 3 5 1 5 2 5 4 2 2 1 2 5 5 2 2 3 5 1 2 1 3 3 3 3 4 4 5 6
## [4108] 5 1 2 2 6 1 3 3 5 3 6 5 6 5 2 5 2 5 6 3 3 4 1 5 2 5 5 4 5 3 4 3 1 1 5 1 5
## [4145] 2 4 4 1 2 5 4 1 1 2 2 5 2 1 5 1 5 4 3 3 5 2 2 6 5 5 5 5 2 2 4 2 5 2 4 5 5
## [4182] 2 5 2 3 3 3 5 2 4 2 3 4 2 5 5 2 4 1 2 2 6 1 1 4 2 2 5 4 1 5 6 5 4 4 1 4 2
## [4219] 2 3 2 1 5 6 2 6 6 6 1 4 4 5 2 4 3 2 1 5 4 2 2 5 5 4 3 4 5 5 1 4 2 4 2 3 5
## [4256] 3 1 3 3 5 4 5 1 1 6 4 5 2 2 2 1 2 2 4 2 1 2 6 2 5 3 1 2 3 3 4 6 2 1 3 1 6
## [4293] 4 3 1 2 5 6 3 5 5 3 5 2 1 4 5 1 5 4 2 2 4 2 1 5 2 2 2 1 4 5 3 1 4 5 4 4 5
## [4330] 3 4 6 4 2 5 4 3 3 1 2 4 6 4 2 1 2 1 6 3 2 5 4 6 4 2 5 3 3 5 5 2 4 2 2 2 2
## [4367] 2 5 5 2 4 6
##
## $centers
## mon_bins rec_bins freq_bins
## 1 2.456845 4.694940 2.514881
## 2 1.546137 2.133047 1.566524
## 3 2.753459 1.942138 2.977358
## 4 4.532882 1.663443 4.966151
## 5 1.179666 4.837047 1.178273
## 6 4.687783 2.022624 2.579186
##
## $totss
## [1] 28750.38
##
## $withinss
## [1] 1063.0625 1167.3938 988.6088 1482.0754 308.9387 350.2081
##
## $tot.withinss
## [1] 5360.287
##
## $betweenss
## [1] 23390.09
head(df_mod6)
## mon_bins rec_bins freq_bins cluster
## 1 1 5 1 5
## 2 5 1 5 4
## 3 4 3 2 6
## 4 4 2 3 6
## 5 2 5 1 5
## 6 3 2 4 3
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
head(df_mod6)
## mon_bins rec_bins freq_bins cluster
## 1 1 5 1 5
## 2 5 1 5 4
## 3 4 3 2 6
## 4 4 2 3 6
## 5 2 5 1 5
## 6 3 2 4 3
nrow(df_mod6)
## [1] 4372
# Interactive 3-D scatter of the three scores, coloured by cluster.
plot_ly(
  x = df_mod6$rec_bins,
  y = df_mod6$freq_bins,
  z = df_mod6$mon_bins,
  color = df_mod6$cluster,
  type = "scatter3d",
  mode = "markers"
)
head(df_mod6,15)
## mon_bins rec_bins freq_bins cluster
## 1 1 5 1 5
## 2 5 1 5 4
## 3 4 3 2 6
## 4 4 2 3 6
## 5 2 5 1 5
## 6 3 2 4 3
## 7 1 5 1 5
## 8 3 5 3 1
## 9 2 5 1 5
## 10 5 2 3 6
## 11 5 2 5 4
## 12 3 1 2 3
## 13 5 1 5 4
## 14 5 3 5 4
## 15 1 5 1 5
# One data frame per cluster.
# The rows are selected by comparing the cluster column only. Comparing the
# whole data frame (`df_mod6 == 1`) tests every cell of every column, so the
# selection also reacted to score values equal to the cluster number and the
# row counts did not correspond to the cluster sizes.
# Each nrow() below equals the matching entry of table(df_mod6$cluster).
df_cluster1 <- df_mod6[df_mod6$cluster == "1", ]
nrow(df_cluster1)
df_cluster2 <- df_mod6[df_mod6$cluster == "2", ]
nrow(df_cluster2)
df_cluster3 <- df_mod6[df_mod6$cluster == "3", ]
nrow(df_cluster3)
df_cluster4 <- df_mod6[df_mod6$cluster == "4", ]
nrow(df_cluster4)
df_cluster5 <- df_mod6[df_mod6$cluster == "5", ]
nrow(df_cluster5)
df_cluster6 <- df_mod6[df_mod6$cluster == "6", ]
nrow(df_cluster6)
# Hierarchical clustering ----
# Standardise the three bin scores (centre and scale each column).
df_scale <- scale(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")])
summary(df_scale)
## rec_bins freq_bins mon_bins
## Min. :-1.1989 Min. :-1.1800 Min. :-1.1951
## 1st Qu.:-1.1989 1st Qu.:-1.1800 1st Qu.:-0.6712
## Median :-0.5400 Median :-0.5097 Median :-0.1472
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7778 3rd Qu.: 0.8310 3rd Qu.: 0.3768
## Max. : 1.4367 Max. : 1.5014 Max. : 1.5994
# Pairwise Euclidean distances between the standardised rows.
dist_mat <- dist(df_scale, method = "euclidean")
# Agglomerative clustering (method "ward.D2") and its dendrogram.
hclust_avg <- hclust(dist_mat, method = "ward.D2")
plot(hclust_avg)
# Cut the tree into 6 groups and attach the labels to a copy of rfm_bin.
member <- cutree(hclust_avg, k = 6)
h_clust <- rfm_bin
h_clust[["cluster"]] <- member
# Plot the hierarchical clusters on the first two principal components.
library(cluster)
res.pca <- prcomp(
  h_clust[, c("mon_bins", "rec_bins", "freq_bins")],
  scale. = TRUE
)
# Coordinates of the individual rows in principal-component space.
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Attach the hierarchical (cutree) cluster labels as a factor.
ind.coord$cluster <- factor(h_clust$cluster)
# Scatter of Dim.1 against Dim.2 with a convex hull per cluster and the
# cluster means overlaid.
# NOTE(review): `variance.percent` is not computed in this section; presumably
# it is defined earlier in the script - confirm it matches this PCA.
ggscatter(
  ind.coord,
  x = "Dim.1",
  y = "Dim.2",
  color = "cluster",
  palette = "npg",
  ellipse = TRUE,
  ellipse.type = "convex",
  size = 1.5,
  legend = "right",
  ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )"),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )")
) +
  stat_mean(aes(color = cluster), size = 4)
# Silhouette analysis ----
# Standardise the bin scores, then fit k-means with 6 centres and 25 random
# starts on the standardised data.
df6_scale <- data.frame(
  scale(
    rfm_bin[, c("mon_bins", "rec_bins", "freq_bins")],
    center = TRUE,
    scale = TRUE
  )
)
k_mod6_1 <- kmeans(df6_scale, centers = 6, nstart = 25)
k_mod6_1
## K-means clustering with 6 clusters of sizes 829, 872, 1091, 688, 561, 331
##
## Cluster means:
## mon_bins rec_bins freq_bins
## 1 -0.87489870 -0.3714783 -0.81533013
## 2 1.54976773 -0.8512784 1.36225682
## 3 -0.88392573 1.3279575 -0.83286360
## 4 0.12801777 -0.8655820 -0.11505324
## 5 0.39766913 0.6990949 0.03761136
## 6 0.08183663 -0.5897375 1.37381141
##
## Clustering vector:
## [1] 3 2 5 4 3 6 3 5 3 2 2 4 2 2 3 2 3 4 3 1 2 4 5 3 4 4 5 2 5 2 4 5 1 3 2 1 6
## [38] 5 5 2 2 4 5 3 3 1 5 2 4 2 2 5 5 5 3 2 2 5 1 4 5 2 5 5 3 4 2 2 1 2 2 2 5 5
## [75] 3 2 2 3 1 2 1 5 3 1 2 3 2 4 5 1 5 2 5 5 5 4 4 4 4 5 2 2 2 2 5 2 2 4 4 2 2
## [112] 2 2 4 3 2 1 1 3 4 1 1 2 5 2 3 1 3 3 5 4 3 5 1 1 5 3 5 2 2 1 2 3 1 2 2 4 1
## [149] 2 5 1 1 5 5 2 1 2 2 4 1 5 4 3 3 5 3 1 2 6 4 1 3 2 3 2 3 3 1 2 2 1 4 3 3 4
## [186] 6 2 1 3 1 3 2 2 5 1 1 1 1 5 3 3 3 2 1 2 2 4 2 3 1 4 1 3 6 5 2 5 1 2 3 2 5
## [223] 1 1 2 4 2 3 3 3 5 2 2 4 4 1 2 5 2 4 3 4 3 4 5 3 4 4 2 3 3 4 3 5 5 4 2 4 4
## [260] 1 1 2 4 3 3 1 2 5 2 3 4 2 4 2 5 4 2 2 2 2 2 3 2 4 3 4 5 5 4 4 2 5 1 1 2 2
## [297] 3 2 2 5 2 4 2 3 4 5 1 4 2 2 5 4 4 3 2 2 5 3 2 3 3 3 5 3 3 5 1 6 2 2 2 2 3
## [334] 5 2 5 5 1 2 5 6 2 3 5 2 5 3 5 4 4 2 4 2 4 4 1 4 1 3 3 3 3 5 1 3 4 3 1 3 3
## [371] 1 3 5 3 3 3 4 3 5 5 1 6 4 4 3 4 3 1 3 3 2 3 6 2 2 2 5 6 4 3 4 4 3 3 5 6 3
## [408] 2 5 1 3 6 3 2 5 3 1 3 3 3 5 6 4 1 3 4 4 1 5 5 3 4 3 1 1 3 2 3 4 2 1 2 2 4
## [445] 2 3 2 3 4 1 2 3 1 5 4 2 3 1 2 6 2 4 6 1 5 6 5 3 1 3 6 2 2 4 4 4 1 2 3 2 1
## [482] 2 6 1 4 3 6 2 3 5 3 5 1 3 1 4 1 3 2 1 1 1 4 1 4 3 3 2 3 3 2 3 1 3 3 6 2 2
## [519] 2 5 1 2 2 5 4 5 4 4 6 4 3 1 5 5 5 4 3 3 3 1 2 2 1 2 3 5 1 3 3 1 3 4 3 4 1
## [556] 1 2 3 3 5 6 2 1 3 2 5 4 3 2 2 1 1 5 4 1 2 2 3 3 2 1 4 3 4 3 4 6 2 2 3 4 4
## [593] 3 3 5 2 2 1 3 4 3 3 6 5 2 2 4 2 5 2 1 3 1 1 4 2 4 5 1 3 4 4 5 6 1 3 5 4 3
## [630] 4 6 1 1 4 4 2 4 1 2 1 2 2 3 4 3 1 1 5 1 3 2 2 2 4 4 1 6 2 3 4 6 5 3 3 3 5
## [667] 5 3 2 4 3 3 1 3 2 6 5 3 4 4 3 5 3 3 3 1 2 1 3 5 5 1 1 4 1 1 3 3 2 5 2 2 2
## [704] 2 3 6 1 3 1 1 4 1 5 4 1 3 2 3 1 5 3 3 2 1 3 3 5 3 3 2 3 4 6 4 4 4 4 2 2 4
## [741] 2 6 1 1 4 2 1 2 3 4 5 4 2 5 3 5 2 5 3 2 3 1 1 1 4 1 4 5 5 5 3 3 1 2 4 1 2
## [778] 1 3 3 5 1 2 5 5 6 2 3 2 3 3 4 1 3 2 3 4 5 3 3 1 1 1 2 4 3 5 4 4 2 1 3 2 2
## [815] 5 2 1 1 4 4 5 1 2 1 2 3 1 4 2 1 2 2 3 3 4 4 2 1 4 3 5 3 6 2 3 2 1 3 5 5 4
## [852] 3 2 3 1 3 3 5 1 2 3 5 5 3 3 2 4 3 4 4 5 1 1 1 2 3 2 3 4 2 3 3 1 5 4 2 5 1
## [889] 2 6 1 2 4 1 2 1 3 1 3 2 2 4 1 1 6 6 4 3 2 2 1 6 1 3 5 5 3 1 6 3 4 3 2 5 1
## [926] 4 2 4 4 3 5 3 1 2 4 2 5 1 2 1 6 1 2 3 6 1 1 1 6 6 5 6 3 4 3 3 4 3 5 2 2 5
## [963] 2 6 5 4 6 1 5 3 1 2 3 4 3 2 1 4 2 5 3 1 2 5 5 3 1 3 2 1 1 3 4 3 3 2 4 3 3
## [1000] 4 3 1 1 4 3 2 2 1 3 2 3 2 4 3 5 3 2 1 3 3 5 3 6 3 4 5 1 5 3 2 4 1 4 5 3 1
## [1037] 2 6 3 1 3 5 2 4 5 3 3 5 5 3 5 5 3 2 2 2 4 1 4 3 5 6 2 2 2 6 1 2 3 4 3 6 4
## [1074] 3 1 4 4 1 2 5 3 1 5 3 2 1 1 3 4 3 4 3 1 4 5 2 1 1 5 3 6 4 4 3 5 1 3 2 1 3
## [1111] 4 3 6 3 2 4 1 4 5 6 6 5 4 2 3 1 3 4 2 3 1 1 4 4 2 2 2 3 3 6 1 1 5 2 2 4 2
## [1148] 4 3 1 3 3 3 2 1 5 1 2 1 3 4 5 2 1 2 2 4 1 4 1 3 1 2 4 1 1 4 3 1 5 3 3 5 3
## [1185] 3 1 4 4 2 1 1 3 2 4 5 5 4 4 3 6 3 5 1 1 5 3 2 2 1 3 1 2 3 1 5 1 2 2 1 2 1
## [1222] 2 5 3 4 1 2 2 4 2 3 4 5 2 2 5 5 3 3 1 5 5 4 4 3 3 4 3 1 1 2 2 2 4 4 3 3 5
## [1259] 1 2 3 6 5 3 4 3 2 4 2 1 3 1 5 2 2 3 2 2 4 3 5 3 6 1 4 3 1 5 3 1 5 3 6 2 1
## [1296] 2 1 3 2 6 2 1 2 3 5 4 3 5 2 1 1 2 4 2 4 3 4 3 3 2 1 2 1 5 5 4 3 2 3 2 4 1
## [1333] 4 4 3 3 4 2 1 3 3 4 5 5 3 2 4 1 2 5 3 5 5 3 4 5 4 1 2 3 5 2 6 2 1 3 2 2 2
## [1370] 5 2 3 3 4 3 5 1 1 3 3 2 2 2 1 3 4 2 4 2 1 1 3 2 5 5 6 2 2 5 5 2 4 6 2 1 3
## [1407] 3 2 4 3 3 2 5 5 3 3 2 2 5 2 2 3 4 2 3 3 5 3 3 1 1 5 2 3 2 1 4 2 1 3 4 2 2
## [1444] 2 4 1 4 2 2 1 3 3 6 5 2 2 3 4 3 2 1 3 6 3 3 3 2 3 4 1 2 4 4 3 3 3 2 1 5 1
## [1481] 5 3 1 3 3 5 3 1 3 5 1 4 2 1 5 2 3 3 3 3 3 4 1 6 4 1 3 2 2 4 2 3 2 5 2 3 2
## [1518] 5 6 3 5 2 5 6 3 4 3 6 2 1 2 6 1 5 2 3 1 2 1 6 3 2 4 3 3 3 5 2 4 4 6 6 4 4
## [1555] 6 3 6 1 3 2 3 5 3 5 2 3 5 2 4 4 2 1 1 5 3 4 3 1 3 2 3 1 1 6 5 3 3 2 1 1 3
## [1592] 3 2 6 1 1 2 2 6 2 2 6 1 4 3 3 5 2 1 1 1 4 3 2 2 2 1 6 2 5 4 2 2 3 1 1 4 4
## [1629] 6 3 2 2 2 2 3 3 4 1 3 6 2 1 1 2 5 2 2 5 4 1 3 2 5 3 6 1 1 1 6 5 1 3 6 3 6
## [1666] 6 6 4 2 4 1 3 1 3 2 2 4 1 3 1 3 3 2 3 6 1 5 2 3 5 6 4 3 3 1 1 2 2 1 1 5 4
## [1703] 3 2 3 6 6 1 2 2 3 6 1 2 1 1 6 6 6 1 2 5 5 5 4 6 2 3 2 1 3 6 3 2 3 4 2 5 3
## [1740] 2 3 6 3 2 2 2 4 6 3 4 2 6 2 4 2 1 4 4 6 4 3 6 4 3 3 2 4 2 2 2 4 4 2 5 4 4
## [1777] 5 3 4 1 1 1 2 4 1 5 2 3 1 3 1 1 2 1 2 5 3 5 2 1 5 3 1 3 1 1 4 1 4 1 1 4 3
## [1814] 2 1 1 2 1 5 1 4 1 2 2 3 4 3 5 5 1 1 3 3 4 2 5 4 1 4 1 6 3 2 2 2 3 2 3 1 2
## [1851] 1 2 3 4 5 4 1 1 4 1 1 2 5 2 4 3 6 3 4 6 5 1 1 4 5 3 5 3 3 3 3 4 5 2 3 4 6
## [1888] 6 1 2 4 3 2 1 4 2 4 4 1 5 5 5 4 4 3 1 2 4 3 5 5 2 5 2 1 1 4 4 1 2 1 4 3 3
## [1925] 5 1 2 3 6 3 5 1 1 1 5 6 3 5 1 1 5 2 3 1 3 1 3 1 1 3 4 5 5 2 6 3 6 1 6 1 3
## [1962] 1 1 3 2 6 2 2 4 5 4 4 1 6 5 2 2 1 6 6 2 3 3 1 2 3 2 3 6 3 4 6 3 4 6 5 2 6
## [1999] 6 2 5 3 2 4 5 6 5 3 5 4 5 5 5 2 5 1 3 3 4 2 3 1 3 2 5 1 1 3 1 3 1 4 5 2 6
## [2036] 3 5 3 4 4 2 2 1 4 4 3 3 3 1 5 2 1 2 1 6 2 3 4 5 4 3 4 3 5 1 3 2 4 5 3 4 3
## [2073] 2 2 1 1 4 2 1 2 3 1 6 3 3 1 3 2 3 1 3 2 3 5 3 4 1 4 2 2 5 1 2 4 4 3 1 6 2
## [2110] 3 5 1 4 3 5 3 3 2 4 1 2 3 6 3 3 3 3 3 3 4 2 3 3 3 5 3 6 4 4 4 6 1 1 5 3 3
## [2147] 2 2 4 4 3 5 3 3 3 5 3 3 3 1 5 3 4 1 2 2 5 4 1 1 1 3 3 2 1 1 2 4 2 2 3 2 4
## [2184] 2 5 1 2 3 5 5 3 3 2 5 3 3 1 3 4 3 1 2 5 3 3 1 4 4 5 3 5 1 5 5 3 1 4 1 3 3
## [2221] 1 3 3 2 5 1 2 2 1 3 3 5 6 5 5 1 3 2 5 2 3 6 5 4 3 2 3 1 3 3 3 4 1 2 3 3 3
## [2258] 4 4 5 3 3 4 6 3 2 6 3 4 3 2 1 1 5 3 1 3 6 6 1 4 1 5 4 1 6 3 3 5 3 2 6 3 3
## [2295] 3 4 1 4 6 3 3 3 1 3 3 1 5 2 5 4 1 1 4 5 4 1 4 5 2 3 2 2 3 4 2 4 2 6 4 2 4
## [2332] 2 3 1 2 6 3 5 3 6 3 2 2 1 3 2 4 2 4 1 1 6 1 2 3 2 6 2 6 4 3 3 4 1 3 3 2 1
## [2369] 2 6 4 1 3 5 3 2 1 1 4 3 6 1 1 4 5 2 4 2 6 5 4 3 3 5 2 4 6 3 4 3 3 1 1 2 5
## [2406] 1 1 4 2 4 1 1 4 5 3 4 3 1 4 1 2 4 1 6 4 2 3 3 5 5 2 4 2 1 5 1 3 3 2 5 3 5
## [2443] 4 3 3 3 1 1 5 4 3 4 1 5 1 6 3 3 1 5 1 3 4 3 2 2 6 1 2 4 1 6 4 4 3 2 3 1 3
## [2480] 1 2 3 2 1 5 6 3 2 3 5 6 2 3 4 3 4 5 3 2 6 4 2 3 3 4 2 1 3 5 3 3 5 4 2 1 1
## [2517] 4 1 4 3 4 1 5 2 2 3 1 2 6 5 1 3 1 4 2 1 3 2 1 1 3 3 5 3 1 2 1 2 2 6 3 1 4
## [2554] 4 3 2 5 6 6 4 3 1 2 2 4 4 5 4 6 2 5 5 4 5 2 3 4 4 3 3 5 6 5 1 5 5 2 4 1 1
## [2591] 2 4 1 5 3 1 2 1 6 3 1 2 5 2 1 6 6 2 6 1 4 4 3 2 3 4 6 2 6 5 3 3 4 3 1 1 3
## [2628] 3 3 3 3 4 3 4 6 2 1 6 6 3 6 3 6 3 3 6 3 3 3 3 3 1 2 4 3 1 3 5 3 3 3 3 5 4
## [2665] 4 2 4 4 2 2 1 1 1 6 4 4 3 4 3 2 1 5 4 5 5 2 6 2 5 1 6 1 4 1 2 1 6 2 4 2 6
## [2702] 3 2 1 4 3 2 6 2 6 6 6 1 1 2 4 3 6 1 1 1 1 2 3 5 2 3 5 5 4 5 4 4 4 5 1 6 3
## [2739] 1 4 5 3 4 2 6 3 4 3 3 6 4 1 3 3 2 3 4 1 2 5 3 6 5 3 5 3 4 3 3 5 4 4 4 4 1
## [2776] 3 1 4 5 3 6 3 3 3 2 3 3 1 5 6 5 1 3 3 2 4 4 5 2 1 3 5 3 1 5 1 3 2 3 2 5 3
## [2813] 3 1 2 5 4 2 4 3 5 2 3 3 4 2 4 2 1 5 4 5 5 1 3 1 2 1 5 1 5 4 2 2 4 1 1 2 3
## [2850] 1 6 2 3 1 4 6 5 1 4 5 2 1 4 1 5 1 5 3 2 6 5 6 6 3 3 5 3 4 1 4 1 1 2 2 2 4
## [2887] 5 4 3 1 5 3 3 2 2 3 3 2 4 6 3 2 5 5 5 6 3 3 3 3 1 6 5 3 6 6 3 3 2 4 3 1 3
## [2924] 2 3 3 4 5 1 1 3 2 5 1 3 5 1 4 3 6 1 2 2 1 6 2 1 3 3 6 2 1 4 3 1 4 6 3 4 3
## [2961] 1 2 1 3 4 2 4 2 6 4 1 4 4 1 6 4 6 6 6 6 6 3 3 2 5 6 1 5 4 3 5 3 2 6 6 5 4
## [2998] 3 2 3 3 1 3 1 2 2 1 1 1 1 1 4 6 5 2 1 3 3 2 3 1 3 1 3 1 1 6 6 2 1 6 4 1 1
## [3035] 3 3 5 3 1 1 5 2 3 2 6 3 1 4 6 6 1 6 6 3 3 2 1 2 1 4 5 1 3 3 1 1 4 5 5 6 6
## [3072] 2 3 3 1 5 4 4 1 3 3 6 4 6 1 3 1 1 5 2 2 3 1 1 3 4 4 6 4 5 4 1 3 3 2 2 5 3
## [3109] 5 4 1 2 1 2 4 4 5 3 4 5 1 2 6 5 5 1 3 3 3 3 5 3 3 1 1 2 1 1 5 1 1 3 4 1 1
## [3146] 1 2 2 3 3 4 1 3 4 3 2 3 1 5 3 5 3 2 1 2 5 2 3 4 2 4 3 4 3 5 3 1 1 2 4 3 2
## [3183] 2 6 3 3 4 3 1 5 4 6 2 4 2 6 4 2 3 1 5 4 2 5 2 4 1 1 3 5 5 3 5 2 2 2 6 4 5
## [3220] 2 4 2 2 5 1 3 2 3 2 1 3 2 5 5 6 3 6 2 5 1 5 2 1 3 3 1 2 6 2 2 4 4 6 4 3 1
## [3257] 5 6 3 3 1 4 3 4 3 2 3 5 2 4 3 6 2 1 5 2 5 1 1 2 5 3 2 1 6 1 6 2 3 1 6 3 1
## [3294] 6 5 1 4 5 6 5 3 6 3 4 3 2 2 3 6 2 3 3 4 4 1 3 3 1 3 4 3 3 1 2 3 2 2 5 2 1
## [3331] 1 1 3 3 4 1 1 1 3 5 1 1 3 6 2 1 2 2 6 1 6 1 5 3 1 6 1 5 6 6 3 1 3 6 6 1 5
## [3368] 4 2 2 5 6 1 6 2 3 1 4 6 3 6 5 1 2 4 3 3 2 4 4 2 1 4 5 1 2 2 2 5 2 3 3 4 5
## [3405] 5 3 1 2 3 1 3 1 6 3 3 3 4 3 4 4 1 3 1 4 5 1 2 2 2 4 2 1 4 3 6 3 4 3 3 1 2
## [3442] 5 4 4 4 1 1 4 2 2 4 6 1 1 1 3 6 5 1 4 5 4 3 5 1 3 4 6 2 4 3 3 5 5 2 2 3 4
## [3479] 3 4 5 2 3 2 3 4 5 3 2 2 5 2 6 4 4 1 1 1 1 2 5 2 6 2 2 2 2 2 3 4 4 6 4 4 3
## [3516] 5 2 6 3 1 6 4 3 3 4 3 3 3 3 1 5 3 2 4 3 4 3 2 3 3 4 5 4 1 3 1 1 1 1 4 6 2
## [3553] 2 1 2 3 1 4 3 1 2 3 4 3 5 6 1 2 1 3 2 5 1 2 4 3 4 6 1 2 5 1 1 2 3 2 6 3 2
## [3590] 1 2 4 1 3 3 4 2 5 5 2 4 1 3 1 6 4 2 3 5 2 5 3 1 5 4 3 5 1 6 4 3 5 1 3 6 4
## [3627] 3 5 2 1 5 3 1 5 6 1 6 6 6 6 3 2 4 1 3 4 1 4 3 2 3 3 5 3 4 2 2 4 5 6 6 4 3
## [3664] 4 1 3 1 3 5 2 3 2 5 3 1 4 2 1 6 3 4 1 5 4 3 4 3 1 6 1 3 2 2 3 6 1 4 6 4 3
## [3701] 3 1 2 4 2 1 1 1 1 4 4 1 2 3 3 3 4 4 5 2 2 1 2 6 5 3 3 4 4 4 4 1 2 1 2 1 2
## [3738] 4 4 2 1 2 3 1 3 4 5 2 1 3 3 2 3 5 3 1 3 2 2 2 1 3 3 3 5 4 6 1 6 5 2 3 4 1
## [3775] 4 4 5 1 1 4 1 3 1 2 2 3 4 3 1 3 4 3 4 4 5 5 1 3 3 2 1 2 2 1 3 4 6 3 4 1 4
## [3812] 4 4 3 3 4 5 2 2 3 1 3 3 4 1 4 3 6 3 3 1 3 3 4 5 2 6 3 4 5 1 4 4 6 2 3 6 1
## [3849] 1 3 2 3 4 3 2 3 4 6 5 2 6 2 2 5 4 2 5 1 3 2 1 3 1 6 2 2 2 6 3 3 4 3 1 4 1
## [3886] 4 2 1 2 3 4 4 2 1 1 3 1 5 3 2 5 1 5 5 1 2 2 4 3 2 5 4 2 3 4 4 4 2 1 2 3 4
## [3923] 2 2 1 2 4 2 3 5 1 2 1 2 2 2 3 5 3 3 5 2 3 3 2 3 4 4 5 2 2 3 3 3 4 1 3 3 2
## [3960] 3 2 4 2 5 5 2 4 2 2 5 3 4 5 2 4 1 2 2 3 2 3 3 5 2 3 4 2 2 6 1 4 3 4 3 1 2
## [3997] 2 5 1 1 6 3 6 4 3 1 1 1 5 1 3 2 1 4 3 4 6 2 6 6 1 4 2 2 6 2 1 4 5 3 1 2 4
## [4034] 5 4 1 1 4 1 6 2 4 2 3 3 5 5 3 5 1 3 3 4 2 2 3 3 2 3 2 6 2 3 4 3 2 3 5 5 3
## [4071] 1 3 3 3 6 6 1 6 3 4 3 5 3 1 3 6 1 1 5 1 3 3 1 1 5 3 3 4 3 5 4 6 5 2 6 3 4
## [4108] 3 3 1 4 4 5 4 5 3 4 2 3 4 3 1 3 1 3 2 5 4 2 3 3 1 3 3 6 3 6 2 4 3 3 3 3 3
## [4145] 1 6 2 3 1 3 6 5 3 1 1 3 1 3 3 5 3 6 4 4 3 1 1 5 3 3 3 3 1 1 6 1 3 1 6 3 3
## [4182] 1 3 1 6 4 6 3 1 2 1 6 2 1 3 3 1 2 3 1 4 2 5 5 2 1 1 3 2 3 3 5 3 2 2 3 2 4
## [4219] 1 4 1 5 3 5 1 2 5 5 3 6 2 3 1 2 4 1 3 3 2 1 4 3 3 6 4 2 3 3 5 2 1 6 1 4 3
## [4256] 4 3 4 5 3 2 3 5 5 2 2 3 1 1 1 5 4 1 6 4 3 1 4 1 3 4 5 1 4 5 2 5 1 3 4 5 4
## [4293] 2 4 3 1 3 2 5 3 3 5 3 1 6 2 3 5 3 2 1 1 2 1 3 3 1 1 1 3 2 3 4 3 2 3 2 2 3
## [4330] 4 2 2 5 1 3 6 4 4 5 1 2 5 2 1 5 1 3 5 6 1 3 2 4 5 1 3 4 5 3 3 1 2 1 1 1 1
## [4367] 1 3 3 1 2 4
##
## Within cluster sum of squares by cluster:
## [1] 425.9603 365.4587 416.1030 344.4632 577.9936 163.7479
## (between_SS / total_SS = 82.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# First rows of the standardised scores before the labels are attached.
head(df6_scale, n = 6L)
## mon_bins rec_bins freq_bins
## 1 -1.1951472 1.4366635 -1.1800234
## 2 1.5994422 -1.1988522 1.5014019
## 3 0.9007949 0.1189056 -0.5096671
## 4 0.9007949 -0.5399733 0.1606893
## 5 -0.4964998 1.4366635 -1.1800234
## 6 0.2021475 -0.5399733 0.8310456
# Store each row's k-means cluster label next to its scores (becomes column 4).
df6_scale[["cluster"]] <- k_mod6_1$cluster
head(df6_scale, n = 6L)
## mon_bins rec_bins freq_bins cluster
## 1 -1.1951472 1.4366635 -1.1800234 3
## 2 1.5994422 -1.1988522 1.5014019 2
## 3 0.9007949 0.1189056 -0.5096671 5
## 4 0.9007949 -0.5399733 0.1606893 4
## 5 -0.4964998 1.4366635 -1.1800234 3
## 6 0.2021475 -0.5399733 0.8310456 6
library(cluster)
# Silhouette widths for the 6-cluster k-means fit. Distances use only the
# three standardised score columns, selected by name so the `cluster` label
# column is excluded wherever it sits in the data frame.
s <- silhouette(
  df6_scale$cluster,
  dist(df6_scale[, c("mon_bins", "rec_bins", "freq_bins")], method = "euclidean")
)
# Supply one colour per cluster (6, matching centers = 6). A colour vector
# whose length differs from the number of clusters is recycled per
# observation, which makes the bars within a cluster alternate colours.
plot(s, col = 1:6, border = NA)
# Silhouette analysis for hierarchical clustering
head(h_clust)
## custid mon_bins freq_bins rec_bins tot_score cluster
## 1 12346 1 1 5 2.333333 1
## 2 12347 5 5 1 3.666667 2
## 3 12348 4 2 3 3.000000 3
## 4 12349 4 3 2 3.000000 3
## 5 12350 2 1 5 2.666667 1
## 6 12352 3 4 2 3.000000 3
# Build the distances from the three standardised score columns only. By this
# point df6_scale also carries the k-means `cluster` label column, and passing
# the whole data frame to dist() would mix those labels into the distances.
s <- silhouette(
  h_clust$cluster,
  dist(df6_scale[, c("mon_bins", "rec_bins", "freq_bins")], method = "euclidean")
)
# One colour per cluster (the tree was cut into 6 groups).
plot(s, col = 1:6, border = NA)
# NOTE(review): the average silhouette width of .33 noted here (lower than
# k-means) was obtained with the label column included in the distances;
# re-run and re-check the comparison.
# Increasing the number of clusters further: fit k-means with 8, 9 and 10
# centres on the raw bin scores and keep the labels as a factor column.
# NOTE(review): kmeans() starts from random centres and no seed is set in this
# section, so cluster numbering can differ between runs; the per-cluster
# descriptions further down refer to the numbering of the recorded run.
k_mod8 <- kmeans(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], centers = 8)
df_mod8 <- cbind(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], "cluster" = k_mod8$cluster)
df_mod8$cluster <- as.factor(df_mod8$cluster)
# Accuracy of the cluster with number of clusters 8=0.8483222
# (The line above was bare text in the script, which is a parse error in R.)
k_mod9 <- kmeans(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], centers = 9)
df_mod9 <- cbind(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], "cluster" = k_mod9$cluster)
df_mod9$cluster <- as.factor(df_mod9$cluster)
k_mod10 <- kmeans(rfm_bin[, c("rec_bins", "freq_bins", "mon_bins")], centers = 10)
# The 10-cluster table also keeps custid so customers can be identified.
df_mod10 <- cbind(rfm_bin[, c("custid", "rec_bins", "freq_bins", "mon_bins")], "cluster" = k_mod10$cluster)
df_mod10$cluster <- as.factor(df_mod10$cluster)
head(df_mod10, n = 6L)
## custid rec_bins freq_bins mon_bins cluster
## 1 12346 5 1 1 4
## 2 12347 1 5 5 7
## 3 12348 3 2 4 1
## 4 12349 2 3 4 8
## 5 12350 5 1 2 4
## 6 12352 2 4 3 5
# Customer segmentation using the 10-cluster solution.
# Cluster 1: recency high (new customers), frequency low, monetary of all types.
df_cluster1 <- df_mod10[df_mod10$cluster == "1", ] # (x4-5x) high recency
head(df_cluster1, n = 20L)
## custid rec_bins freq_bins mon_bins cluster
## 3 12348 3 2 4 1
## 8 12354 5 3 3 1
## 23 12372 3 3 3 1
## 27 12377 5 3 4 1
## 29 12379 3 2 3 1
## 38 12393 3 3 3 1
## 39 12394 3 2 3 1
## 43 12399 4 3 3 1
## 47 12405 5 3 4 1
## 52 12410 5 2 3 1
## 53 12412 3 3 3 1
## 54 12413 3 2 3 1
## 58 12418 4 2 3 1
## 61 12422 4 2 3 1
## 63 12424 5 2 4 1
## 64 12425 3 2 3 1
## 73 12434 3 3 3 1
## 74 12435 3 2 5 1
## 82 12446 3 3 3 1
## 89 12453 4 3 3 1
# Column types and size of the cluster 1 subset.
str(df_cluster1)
## 'data.frame': 480 obs. of 5 variables:
## $ custid : num 12348 12354 12372 12377 12379 ...
## $ rec_bins : num 3 5 3 5 3 3 3 4 5 5 ...
## $ freq_bins: num 2 3 3 3 2 3 2 3 3 2 ...
## $ mon_bins : num 4 3 3 4 3 3 3 3 4 3 ...
## $ cluster : Factor w/ 10 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
library(plyr)
# Per-bin frequency tables (plyr::count), left disabled:
# count(df_cluster1$rec_bins)
# count(df_cluster1$freq_bins)
# count(df_cluster1$mon_bins)
dim(df_cluster1)[1L]
## [1] 480
# Cluster 2: frequency high, recency high, monetary moderate to high; best customers.
df_cluster2 <- df_mod10[df_mod10$cluster == "2", ]
tail(df_cluster2, n = 20L)
## custid rec_bins freq_bins mon_bins cluster
## 2871 16222 5 5 3 2
## 3089 16520 5 5 3 2
## 3224 16714 5 4 3 2
## 3234 16725 5 5 3 2
## 3295 16801 5 5 3 2
## 3358 16889 5 5 3 2
## 3382 16919 5 5 5 2
## 3531 17126 5 5 3 2
## 3634 17282 4 4 4 2
## 3669 17337 5 5 5 2
## 3673 17341 5 5 3 2
## 3725 17406 5 5 5 2
## 3754 17444 5 5 5 2
## 3777 17472 5 5 3 2
## 3795 17504 5 5 5 2
## 4009 17787 5 5 4 2
## 4049 17850 5 5 5 2
## 4282 18168 5 5 3 2
## 4333 18231 5 5 5 2
## 4354 18260 5 5 5 2
nrow(df_cluster2)
## [1] 50
# count(df_cluster2$rec_bins)
# count(df_cluster2$freq_bins)
# count(df_cluster2$mon_bins)
# Cluster 3: average customers with low to average recency, frequency and
# spending capability.
df_cluster3 <- df_mod10[df_mod10$cluster == 3, ]
nrow(df_cluster3)
## [1] 1001
# count(df_cluster3$rec_bins)
# count(df_cluster3$freq_bins)
# count(df_cluster3$mon_bins)
# View() opens a data viewer window, so only call it in an interactive
# session; in a batch or knitted run it can fail or block.
if (interactive()) {
  View(df_cluster3)
}
# Cluster 4: recent buying customers, new.
df_cluster4 <- df_mod10[df_mod10$cluster == "4", ]
dim(df_cluster4)[1L]
## [1] 1091
head(df_cluster4, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 1 12346 5 1 1 4
## 5 12350 5 1 2 4
## 7 12353 5 1 1 4
## 9 12355 5 1 2 4
## 15 12361 5 1 1 4
## 17 12363 4 2 2 4
## 19 12365 5 2 2 4
## 24 12373 5 1 2 4
## 34 12386 5 1 2 4
## 44 12401 5 1 1 4
## 45 12402 5 1 1 4
## 55 12414 5 2 2 4
## 65 12426 5 2 2 4
## 75 12436 4 1 2 4
## 78 12441 5 1 1 4
# Per-bin frequency tables for cluster 4, left disabled:
# count(df_cluster4$rec_bins)
# count(df_cluster4$freq_bins)
# count(df_cluster4$mon_bins)
# Cluster 5: frequently buying, loyal customers.
df_cluster5 <- df_mod10[df_mod10$cluster == "5", ]
dim(df_cluster5)[1L]
## [1] 256
# count(df_cluster5$rec_bins)
# count(df_cluster5$freq_bins)
# count(df_cluster5$mon_bins)
tail(df_cluster5, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 4078 17886 2 5 3 5
## 4102 17917 2 4 2 5
## 4135 17961 2 5 3 5
## 4137 17964 2 5 2 5
## 4146 17974 2 5 3 5
## 4151 17979 2 5 3 5
## 4175 18016 2 5 3 5
## 4179 18022 2 5 3 5
## 4185 18034 2 5 2 5
## 4187 18036 2 4 2 5
## 4192 18043 2 5 2 5
## 4230 18096 1 5 3 5
## 4252 18125 2 5 3 5
## 4274 18156 1 5 3 5
## 4349 18252 2 4 2 5
# Cluster 6: high-paying customers with a high buying frequency.
df_cluster6 <- df_mod10[df_mod10$cluster == "6", ]
tail(df_cluster6, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 2582 15827 3 5 4 6
## 2703 16007 2 5 4 6
## 2828 16170 2 5 4 6
## 3041 16455 3 4 4 6
## 3084 16515 3 5 4 6
## 3170 16638 2 4 4 6
## 3288 16791 2 5 4 6
## 3327 16841 2 5 4 6
## 3449 17015 2 5 4 6
## 3469 17043 2 4 4 6
## 3508 17092 2 5 4 6
## 3553 17162 2 5 4 6
## 3635 17284 3 5 4 6
## 3977 17738 2 4 4 6
## 4336 18235 3 5 4 6
dim(df_cluster6)[1L]
## [1] 35
# count(df_cluster6$rec_bins)
# count(df_cluster6$freq_bins)
# count(df_cluster6$mon_bins)
# Cluster 7: slipping - high-paying customers who bought frequently in the
# past but do not buy now.
df_cluster7 <- df_mod10[df_mod10$cluster == "7", ]
tail(df_cluster7, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 4266 18145 1 5 5 7
## 4286 18172 1 5 5 7
## 4293 18180 1 5 4 7
## 4306 18198 1 5 5 7
## 4313 18210 1 5 5 7
## 4321 18219 1 5 5 7
## 4325 18223 1 5 5 7
## 4327 18225 1 5 5 7
## 4328 18226 2 5 5 7
## 4331 18229 1 5 5 7
## 4341 18241 1 5 5 7
## 4343 18245 1 5 5 7
## 4352 18257 2 5 5 7
## 4362 18272 1 5 5 7
## 4371 18283 1 5 5 7
# Size of cluster 7. The call here previously repeated nrow(df_cluster6), so
# the recorded "[1] 35" was cluster 6's size, not cluster 7's.
# NOTE(review): re-run to record the value; from the other nine recorded
# cluster sizes and 4372 rows in total it should be 689.
nrow(df_cluster7)
# count(df_cluster7$rec_bins)
# count(df_cluster7$freq_bins)
# count(df_cluster7$mon_bins)
# Cluster 8: low-frequency customers who rarely buy; spans low to high spenders.
df_cluster8 <- df_mod10[df_mod10$cluster == "8", ]
tail(df_cluster8, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 4265 18144 1 3 5 8
## 4278 18161 1 3 4 8
## 4281 18167 1 3 3 8
## 4284 18170 2 2 3 8
## 4290 18177 2 3 3 8
## 4292 18179 1 3 4 8
## 4298 18188 1 3 5 8
## 4323 18221 2 3 3 8
## 4330 18228 2 2 3 8
## 4332 18230 1 3 5 8
## 4337 18236 2 3 3 8
## 4338 18237 1 3 3 8
## 4353 18259 2 2 5 8
## 4357 18263 2 3 3 8
## 4372 18287 2 3 4 8
dim(df_cluster8)[1L]
## [1] 607
# count(df_cluster8$rec_bins)
# count(df_cluster8$freq_bins)
# count(df_cluster8$mon_bins)
# Cluster 9: recent buyers, frequent buyers, high spenders; best customers.
df_cluster9 <- df_mod10[df_mod10$cluster == "9", ]
tail(df_cluster9, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 3348 16875 3 5 5 9
## 3388 16928 3 5 5 9
## 3397 16940 3 5 5 9
## 3428 16984 3 5 5 9
## 3476 17050 4 5 5 9
## 3484 17061 3 4 5 9
## 3720 17400 4 5 5 9
## 3800 17509 3 5 5 9
## 3860 17589 3 5 5 9
## 3862 17591 3 5 5 9
## 3876 17612 3 5 5 9
## 3918 17667 3 5 5 9
## 3963 17722 4 5 5 9
## 4060 17863 3 5 5 9
## 4310 18204 3 5 5 9
dim(df_cluster9)[1L]
## [1] 91
# count(df_cluster9$rec_bins)
# count(df_cluster9$freq_bins)
# count(df_cluster9$mon_bins)
# Cluster 10: rows labelled 10 (no description was recorded for this cluster).
df_cluster10 <- df_mod10[df_mod10$cluster == "10", ]
tail(df_cluster10, n = 15L)
## custid rec_bins freq_bins mon_bins cluster
## 3587 17214 3 5 3 10
## 3622 17259 5 5 2 10
## 3625 17265 4 5 2 10
## 3858 17585 4 5 3 10
## 3878 17614 3 4 2 10
## 4001 17774 4 5 3 10
## 4020 17802 3 5 3 10
## 4082 17890 5 4 2 10
## 4086 17894 3 5 3 10
## 4089 17897 5 5 2 10
## 4105 17921 3 5 3 10
## 4162 17997 3 5 3 10
## 4244 18116 3 5 3 10
## 4291 18178 4 4 3 10
## 4305 18196 4 5 3 10
# Size of cluster 10.
dim(df_cluster10)[1L]
## [1] 72
# Per-bin frequency tables for cluster 10, left disabled:
# count(df_cluster10$rec_bins)
# count(df_cluster10$freq_bins)
# count(df_cluster10$mon_bins)
# High-valued customers: those with an RFM score of 5, 5, 5.
# customerID duplicates the existing custid column; it is kept so that any
# later code reading it continues to work.
df_mod10$customerID <- rfm_bin$custid
# Build the row filter from df_mod10's own columns. It was previously built
# from df_mod6, which selects the right rows only while the two data frames
# happen to share the same row order and length.
is_top_rfm <- df_mod10$rec_bins == 5 &
  df_mod10$freq_bins == 5 &
  df_mod10$mon_bins == 5
df_mod10[is_top_rfm, ]
## custid rec_bins freq_bins mon_bins cluster customerID
## 124 12501 5 5 5 2 12501
## 573 13093 5 5 5 2 13093
## 1196 13952 5 5 5 2 13952
## 1242 14016 5 5 5 2 14016
## 1564 14461 5 5 5 2 14461
## 2135 15235 5 5 5 2 15235
## 2243 15379 5 5 5 2 15379
## 2460 15665 5 5 5 2 15665
## 2567 15808 5 5 5 2 15808
## 3382 16919 5 5 5 2 16919
## 3669 17337 5 5 5 2 17337
## 3725 17406 5 5 5 2 17406
## 3754 17444 5 5 5 2 17444
## 3795 17504 5 5 5 2 17504
## 4049 17850 5 5 5 2 17850
## 4333 18231 5 5 5 2 18231
## 4354 18260 5 5 5 2 18260
# Plot the 10-cluster k-means solution on the first two principal components.
res.pca <- prcomp(
  df_mod10[, c("mon_bins", "rec_bins", "freq_bins")],
  scale. = TRUE
)
# Coordinates of the individual rows in principal-component space.
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Attach the k-means cluster labels as a factor.
ind.coord$cluster <- factor(df_mod10$cluster)
# Scatter of Dim.1 against Dim.2 with a convex hull per cluster and the
# cluster means overlaid.
# NOTE(review): `variance.percent` is not computed in this section; presumably
# it is defined earlier in the script - confirm it matches this PCA.
ggscatter(
  ind.coord,
  x = "Dim.1",
  y = "Dim.2",
  color = "cluster",
  palette = "npg",
  ellipse = TRUE,
  ellipse.type = "convex",
  size = 1.5,
  legend = "right",
  ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )"),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )")
) +
  stat_mean(aes(color = cluster), size = 4)